Load Packages

library(DT)
library(dplyr)
library(readxl)
library(plotly)
library(skimr)
library(ggplot2)
library(gridExtra)
library(tidyr)
library(GGally)
library(ggcorrplot)

Read in and inspect data

file_path = "../data/master_df.xlsx"
df = read_excel(file_path)
datatable(head(df, 10),
          options = list(scrollX = TRUE,     
                         pageLength = 5,            
                         autoWidth = TRUE,
                         searching = FALSE),
          caption = "Preview of Trade and Congressional Data")

Get summary statistics

skim_df =  skim(df) %>%
            rename(variable_type = skim_type,
                   variable_name = skim_variable,
                   missing_values = n_missing,
                   min_value = POSIXct.min,
                   max_value = POSIXct.max,
                   median_value = POSIXct.median,
                   unique_values = POSIXct.n_unique,
                   )


datatable(skim_df, options = list(scrollX = TRUE,                
                                  pageLength = 10,        
                                  autoWidth = TRUE,             
                                  searching = FALSE),
          caption = "Summary Statistics for Data")

Visualizations for numeric values

numerical_vars = c("trade_price", "price_change_pct", "seniority_years", "excess_return")

create_histogram = function(var) {ggplot(df, aes(x = .data[[var]])) + 
                      geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", color = "black", alpha = 0.7) +
                      geom_density(color = "red", size = 1) +
                      labs(title = paste("Distribution of", var), x = var, y = "Density") +
                      theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14),
                                              axis.text = element_text(size = 10),    
                                              axis.title = element_text(size = 12))}

histogram_plots = lapply(numerical_vars, create_histogram)
grid.arrange(grobs = histogram_plots, ncol = 2)

Visualizations for categorical values

create_interactive_barplot = function(var) {
                                plot_ly(data = df, y = ~reorder(.data[[var]], table(.data[[var]])[.data[[var]]]),
                                type = "bar", orientation = "h", marker = list(color = "steelblue")) %>%
                                layout(title = paste("Distribution of", var), xaxis = list(title = "Count"),
                                       yaxis = list(title = var))}

transaction_plot <- create_interactive_barplot("transaction_type")
party_plot <- create_interactive_barplot("party")
state_plot <- create_interactive_barplot("state")
gender_plot <- create_interactive_barplot("gender")

transaction_plot
party_plot
state_plot
gender_plot

Create columns to define success

define_trade_success = function(row) {
                          if (row$transaction_type %in% c("Purchase")) {
                            return(ifelse(row$price_change_pct > 0, 1, 0))} 
                          else if (row$transaction_type %in% c("Sale", "Sale (Partial)", "Sale (Full)")) {
                            return(ifelse(row$price_change_pct < 0, 1, 0))}                       
                          else {return(NA)}}

df = df %>%
     rowwise() %>%
     mutate(trade_success = define_trade_success(cur_data())) %>%
     ungroup() %>%
     filter(!is.na(trade_success))  

define_market_success = function(row) {
                            if (row$transaction_type %in% c("Purchase")) {
                              return(ifelse(row$excess_return > 0, 1, 0))} 
                            else if (row$transaction_type %in% c("Sale", "Sale (Partial)", "Sale (Full)")) {
                              return(ifelse(row$excess_return < 0 & row$price_change_pct < 0, 1, 0))} 
                            else {return(NA)}}

df = df %>%
     rowwise() %>%
     mutate(market_success = define_market_success(cur_data())) %>%
     ungroup() %>%
     filter(!is.na(market_success)) 

Visualization for distribution of success

success_colors = c("0" = "red", "1" = "green")

success_plot = ggplot(df, aes(x = factor(trade_success), 
                              fill = factor(trade_success))) + geom_bar() +
                              scale_fill_manual(values = success_colors, labels = c("0: Unsuccessful", "1: Successful")) +
                              labs(title = "Distribution of Trade Success Outcomes",
                                   x = "Trade Success", y = "Count", fill = "Outcome") +
                              theme(plot.title = element_text(hjust = 0.5, size = 14),
                                    axis.text = element_text(size = 10),
                                    axis.title = element_text(size = 12),
                                    legend.position = "top")

market_success_plot = ggplot(df, aes(x = factor(market_success), 
                                      fill = factor(market_success))) + geom_bar() +
                              scale_fill_manual(values = success_colors, labels = c("0: Unsuccessful", "1: Successful")) +
                                      labs(title = "Distribution of Market Success Outcomes",
                                           x = "Market Success", y = "Count", fill = "Outcome") +
                                      theme(plot.title = element_text(hjust = 0.5, size = 14),
                                            axis.text = element_text(size = 10),
                                            axis.title = element_text(size = 12),
                                            legend.position = "top" )

grid.arrange(success_plot, market_success_plot, ncol = 2)

Committee Success Rates with Confidence Intervals

z = qnorm(0.975) 

compute_confidence_interval = function(successes, total) {
                                  p = successes / total
                                  se = sqrt(p * (1 - p) / total)
                                  lower = max(0, p - z * se) 
                                  upper = min(1, p + z * se)  
                                  return(list(proportion = p, lower_bound = lower, upper_bound = upper))}

committee_columns = grep("committee_", colnames(df), value = TRUE)

committee_success_rates = data.frame(committee = character(),
                                     success_rate = numeric(),
                                     lower_ci = numeric(),
                                     upper_ci = numeric(),
                                     total_trades = integer(),
                                     stringsAsFactors = FALSE)

for (committee in committee_columns) {committee_name = gsub("committee_", "", committee)  
                                      committee_data = df %>% filter(.data[[committee]] == 1)
                                      total_trades = nrow(committee_data)
                                      successes = nrow(committee_data %>% filter(trade_success == 1))
                                      
                                      if (total_trades > 0) {
                                        ci = compute_confidence_interval(successes, total_trades)
                                        committee_success_rates = rbind(committee_success_rates,
                                                                        data.frame(committee = committee_name,
                                                                                   success_rate = ci$proportion,
                                                                                   lower_ci = ci$lower_bound,
                                                                                   upper_ci = ci$upper_bound,
                                                                                   total_trades = total_trades))} 
                                      else {committee_success_rates = rbind(committee_success_rates,
                                                                            data.frame(committee = committee_name,
                                                                                       success_rate = NA,
                                                                                       lower_ci = NA,
                                                                                       upper_ci = NA,
                                                                                       total_trades = total_trades))}}

committee_success_rates = committee_success_rates %>% filter(!is.na(success_rate))
committee_success_rates = committee_success_rates %>% arrange(desc(success_rate))

ggplot(committee_success_rates, aes(x = success_rate, y = reorder(committee, success_rate))) +
                                geom_bar(stat = "identity", fill = "skyblue", width = 0.8) +
                                geom_errorbar(aes(xmin = lower_ci, xmax = upper_ci),
                                              width = 0.2, color = "black") +
                                labs(title = "Success Rate by Committee with Confidence Intervals",
                                     x = "Success Rate", y = "Committee") + theme_minimal() +
                                theme(plot.title = element_text(hjust = 0.5, size = 16, margin = margin(b = 10)),
                                      axis.text = element_text(size = 10),
                                      axis.title = element_text(size = 12))

max_trades = max(committee_success_rates$total_trades, na.rm = TRUE)
                 committee_success_rates = committee_success_rates %>%
                 mutate(bubble_size = total_trades / max_trades * 1000)  

ggplot(committee_success_rates, aes(x = success_rate, y = reorder(committee, success_rate), size = bubble_size)) +
                                geom_point(alpha = 0.6, color = "steelblue") +  
                                scale_size_continuous(range = c(3, 20), 
                                                      name = "Number of Trades",  
                                                      breaks = c(100, 500, 1000),  
                                                      labels = c("100 trades", "500 trades", "1000 trades")) +
                                labs(title = "Success Rate by Committee (Bubble Size = Number of Trades)",
                                     x = "Success Rate", y = "Committee") +
                                theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
                                                        axis.text = element_text(size = 10),
                                                        axis.title = element_text(size = 12),
                                                        legend.position = "right" )

min_trades = 30

filtered_committee_df = committee_success_rates %>%
                        filter(total_trades >= min_trades)

ggplot(filtered_committee_df, aes(x = success_rate, y = reorder(committee, success_rate))) +
                              geom_bar(stat = "identity", fill = "skyblue", width = 0.8) + 
                              labs(title = paste("Success Rate by Committee (Minimum", min_trades, "Trades)"),
                                   x = "Success Rate", y = "Committee") + theme_minimal() +
                              theme(plot.title = element_text(hjust = 0.5, size = 16, margin = margin(b = 10)),
                                    axis.text = element_text(size = 10),
                                    axis.title = element_text(size = 12),
                                    plot.margin = margin(t = 20, r = 20, b = 20, l = 20))

min_trades = 30

sector_success_rates = df %>% filter(!is.na(sector)) %>%
                       group_by(sector) %>%
                       summarise(total_trades = n(),
                                 successes = sum(trade_success, na.rm = TRUE),
                                 .groups = "drop") %>%
                       rowwise() %>%
                       mutate(ci = list(compute_confidence_interval(successes, total_trades)),
                              success_rate = ci$proportion,
                              lower_ci = ci$lower_bound,
                              upper_ci = ci$upper_bound) %>%
                       unnest_wider(ci) %>%
                       filter(total_trades >= min_trades) 

ggplot(sector_success_rates, aes(x = success_rate, y = reorder(sector, success_rate))) +
                             geom_bar(stat = "identity", fill = "skyblue", width = 0.8) +
                             geom_errorbar(aes(xmin = lower_ci, xmax = upper_ci),
                                           width = 0.2, color = "black") +
                             geom_text(aes(label = paste0(total_trades, " trades"), x = success_rate + 0.01),
                                       hjust = 0, size = 3) +
                             labs(title = "Success Rate by Sector with Confidence Intervals",
                             x = "Success Rate", y = "Sector") +
                             theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
                                                     axis.text = element_text(size = 10),
                                                     axis.title = element_text(size = 12),
                                                     plot.margin = margin(t = 20, r = 20, b = 20, l = 20))

ggplot(df, aes(x = seniority_years, y = trade_success)) +
           geom_jitter(width = 0.2, height = 0.05, color = "blue", alpha = 0.6) +
           labs(title = "Success vs. Seniority Years",
                x = "Seniority (Years)", y = "Success (1 = Successful Trade)") +
           theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14),
                                   axis.text = element_text(size = 10),
                                   axis.title = element_text(size = 12))

ggplot(df, aes(x = factor(trade_success), y = seniority_years)) +
           geom_boxplot(fill = "skyblue", color = "black", width = 0.5) +
           labs(title = "Seniority Years by Success",
                x = "Success (0 = Unsuccessful, 1 = Successful)",
                y = "Seniority (Years)") +
           theme_minimal() +
           theme(plot.title = element_text(hjust = 0.5, size = 14),
                 axis.text = element_text(size = 10),
                 axis.title = element_text(size = 12))

party_success = df %>%
                 group_by(party) %>%
                 summarise(average_success_rate = mean(trade_success, na.rm = TRUE),
                           .groups = "drop")

ggplot(party_success, aes(x = party, y = average_success_rate)) +
                      geom_bar(stat = "identity", fill = "skyblue", color = "black") +
                      labs(title = "Success Rate by Party", x = "Party", y = "Average Success Rate") +
                      theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 14),
                                              axis.text = element_text(size = 10),
                                              axis.title = element_text(size = 12))

pairplot_vars = c("trade_price", "price_change_pct", "seniority_years", "excess_return", "trade_success")

pairplot_data = df %>%
                select(all_of(pairplot_vars)) %>%
                mutate(trade_success = as.factor(trade_success))  

ggpairs(pairplot_data, aes(color = trade_success, alpha = 0.6),
                       lower = list(continuous = wrap("points", alpha = 0.4)),
                       upper = list(continuous = wrap("cor", size = 4)),       
                       diag = list(continuous = wrap("densityDiag", alpha = 0.4))) +
                       theme_minimal() + theme(legend.position = "bottom",  
                                               plot.title = element_text(hjust = 0.5, size = 14)) +
                       labs(title = "Pair Plot of Selected Variables by Trade Success")

df = df %>%
     mutate(party_encoded = as.numeric(as.factor(party)),  
            gender_encoded = as.numeric(as.factor(gender)))

corr_vars = c("trade_price", "price_change_pct", "seniority_years", 
              "excess_return", "trade_success", "party_encoded", "gender_encoded")

subset_df = df %>% select(all_of(corr_vars))
corr_matrix = cor(subset_df, use = "complete.obs")

ggcorrplot(corr_matrix,
           method = "square",           
           type = "full",              
           lab = TRUE,                
           lab_size = 4,                
           colors = c("red", "white", "blue"),  
           title = "Correlation Matrix Including Encoded Categorical Variables",
           legend.title = "Correlation") + theme_minimal() +
           theme(plot.title = element_text(hjust = 0.5, size = 16),  
                 legend.position = "right",                         
                 axis.text.x = element_text(angle = 45, hjust = 1))

committee_columns = grep("committee_", names(df), value = TRUE)
committee_df = df %>% select(all_of(committee_columns))

committee_corr = cor(committee_df, use = "complete.obs")

colnames(committee_corr) = abbreviate(colnames(committee_corr), minlength = 15)
rownames(committee_corr) = abbreviate(rownames(committee_corr), minlength = 15)

ggcorrplot(committee_corr,
           method = "square",          
           type = "full",               
           lab = FALSE,                
           colors = c("red", "white", "blue"),  
           title = "Correlation Matrix of Committee Memberships",
           legend.title = "Correlation") +
           theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),  
                                   legend.position = "right",                     
                                   axis.text.x = element_text(size = 8, angle = 90, hjust = 1), 
                                   axis.text.y = element_text(size = 8))

df = df %>%
     mutate(num_committees = rowSums(select(., starts_with("committee_")), na.rm = TRUE))

success_by_committees <- df %>%
                         group_by(num_committees) %>%
                         summarise(average_success_rate = mean(trade_success, na.rm = TRUE),
                                   .groups = "drop")

ggplot(success_by_committees, aes(x = num_committees, y = average_success_rate)) +
                              geom_bar(stat = "identity", fill = "skyblue", color = "black") +
                              labs(title = "Success Rate by Number of Committee Memberships",
                                   x = "Number of Committees", y = "Average Success Rate") +
       theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
                               axis.text = element_text(size = 10),
                               axis.title = element_text(size = 12))

df = df %>%
     mutate(trade_date = as.Date(trade_date))  

df = df %>%
     mutate(trade_year = format(trade_date, "%Y"))  

trades_per_year = df %>%
                  group_by(trade_year) %>%
                  summarise(num_trades = n(), .groups = "drop")

ggplot(trades_per_year, aes(x = as.numeric(trade_year), y = num_trades)) +
                        geom_line(group = 1, color = "blue", size = 1) +
                        geom_point(color = "red", size = 2) +
                        labs(title = "Number of Trades Over Time",
                             x = "Year", y = "Number of Trades") +
       theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
                               axis.text = element_text(size = 10),
                               axis.title = element_text(size = 12))

df = df %>%
     mutate(trade_date = as.Date(trade_date),        
            trade_year = format(trade_date, "%Y")) 

success_per_year = df %>%
                   group_by(trade_year) %>%
                   summarise(avg_success_rate = mean(trade_success, na.rm = TRUE), 
                             .groups = "drop")

ggplot(success_per_year, aes(x = as.numeric(trade_year), y = avg_success_rate)) +
                         geom_line(group = 1, color = "blue", size = 1) +
                         geom_point(color = "red", size = 2) +
                         labs(title = "Success Rate Over Time",
                              x = "Year", y = "Average Success Rate") +
       theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
                               axis.text = element_text(size = 10),
                               axis.title = element_text(size = 12))

sector_success = df %>%
                 group_by(sector) %>%
                 summarise(avg_success_rate = mean(trade_success, na.rm = TRUE), 
                           .groups = "drop")

ggplot(sector_success, aes(x = avg_success_rate, y = reorder(sector, avg_success_rate))) +
                       geom_bar(stat = "identity", fill = "darkorange", color = "black") +
                       labs(title = "Success Rate by Sector", x = "Average Success Rate", y = "Sector") +
       theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),  
                               axis.text = element_text(size = 10),               
                               axis.title = element_text(size = 12))

sector_counts = df %>%
                group_by(sector) %>%
                summarise(num_trades = n(), .groups = "drop")

ggplot(sector_counts, aes(x = num_trades, y = reorder(sector, num_trades))) +
                      geom_bar(stat = "identity", fill = "purple", color = "black") +
                      labs(title = "Number of Trades by Sector", x = "Number of Trades", y = "Sector") +
       theme_minimal() + theme(plot.title = element_text(hjust = 0.5, size = 16),
                               axis.text = element_text(size = 10),               
                               axis.title = element_text(size = 12))